Leemos los df

library(ggplot2)
# install.packages('pacman')
pacman::p_load(forecast)
dev.new(width=12, height=5)
knitr::opts_chunk$set(fig.width=12, fig.height=5) 

# install.packages('tidyverse')
df_train = read.csv('data/train.csv')
df_train$Date = as.Date(df_train$Date)
df_test = read.csv('data/test.csv')
df_store = read.csv('data/store.csv')
head(df_store)
head(df_train)
head(df_test)

Store con minima distancia

min(df_store[!is.na(df_store$CompetitionDistance),'CompetitionDistance'])
## [1] 20
df_store[df_store$CompetitionDistance==20,]

Obtenemos las stores de test

stores_test = unique(df_test$Store)

#Filtramos del DF las stores de test

df_train_filtered = df_train[df_train$Store %in% stores_test, ]
head(df_train_filtered)

Filtramos la tienda 1

df_train_tienda_1 = df_train_filtered[df_train_filtered$Store==1,]
head(df_train_tienda_1)

Filtramos la tienda 3

df_train_tienda_3 = df_train_filtered[df_train_filtered$Store==3,]
head(df_train_tienda_3)

Chequeo de la cantidad de días con venta en el día domingo

nrow(df_train_filtered[(df_train_filtered$DayOfWeek==7),])
## [1] 110024
nrow(df_train_filtered[(df_train_filtered$DayOfWeek==7)&(df_train_filtered$Sales==0),])
## [1] 107075

Correlación cruzada con convolve de la ventas tienda 1, y clientes

df_train_tienda_1$ventas_clientes = convolve(df_train_tienda_1$Sales, df_train_tienda_1$Customers, conj = TRUE)

ggplot(df_train_tienda_1, aes(x = Date, y = ventas_clientes)) +
    geom_line(color="green", size = 1.2) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
    ggtitle("Cantidad de ventas a lo largo de los últimos años")

Correlación cruzada Clientes Ventas

ggCcf(df_train_tienda_1$Customers, df_train_tienda_1$Sales,lag.max = 300)

# Autocorrelación de la serie de ventas

ggAcf(df_train_tienda_1$Sales, lag.max = length(df_train_tienda_1$Sales))

# Correlación cruzada tienda 1 y 3

ggplot(df_train_tienda_1, aes(x = Date, y = Sales)) +
    geom_line(color="red", size = 1.2) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
    ggtitle("Ventas Tienda 1") +
    xlab("Fecha") +
    ylab("Ventas")

ggCcf(df_train_tienda_1$Sales,df_train_tienda_3$Sales)

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(tsibble)
## 
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(feasts)
## Loading required package: fabletools
## 
## Attaching package: 'fabletools'
## The following objects are masked from 'package:forecast':
## 
##     accuracy, forecast
y <- tsibble(
  Date = df_train_tienda_1$Date,
  Sales = df_train_tienda_1$Sales,
  index = Date
)
autoplot(y,Sales) +
  labs(title = "Ventas diarias Tienda 1",
       y = "Ventas", x='Fecha')

#Filtrando cuando se encuentra cerrado

melsyd_economy <- df_train_filtered %>%
  filter(Open == 1, Store == 1) %>%
  as_tsibble(index = Date)

autoplot(melsyd_economy, Sales) +
  labs(title = "Ventas - Domingos y feriados filtrados")

melsyd_economy <- df_train_filtered %>%
  filter(Store == 1) %>%
  mutate(Sales = ifelse(Open == 1, Sales, NA),
         Customers = ifelse(Open == 1, Customers, NA),
         Promo = ifelse(Open == 1, Promo, NA),
         StateHoliday = ifelse(Open == 1, StateHoliday, NA),
         SchoolHoliday = ifelse(Open == 1, SchoolHoliday, NA),
         ) %>%
  as_tsibble(index = Date)

Se puede observar cierta estacionalidad a lo largo del año

melsyd_economy %>%
  gg_season(Sales, labels = "both") +
  labs(title = "Seasonal plot")
## Warning: Removed 3 row(s) containing missing values (geom_path).
## Warning: Removed 3 rows containing missing values (geom_text).

Estacionalidad semanal

melsyd_economy %>% gg_season(Sales, period = "week") +
  theme(legend.position = "none") +
  labs(title="Weekly Seasonality")
## Warning: Removed 141 row(s) containing missing values (geom_path).

Boxplots estacionalidad semanal

ggplot(melsyd_economy %>%
  filter(Open==1) %>%
    mutate(DayOfWeek = as.factor(DayOfWeek)),  aes(x= DayOfWeek, y=Sales, color = DayOfWeek)) + 
  geom_boxplot()+
  labs(title = "Ventas diarias Tienda 1",
       y = "Ventas", x='Día de la semana')

# La mediana y los extramos de la caja se comportan de manera distinta dependiendo el día de la semana

Estacionalidad anual

melsyd_economy %>% gg_season(Sales, period = "year") +
  theme(legend.position = "none") +
  labs(title="Year Seasonality")
## Warning: Removed 3 row(s) containing missing values (geom_path).

#Subseries Plot, estacionalidad dentro de la semana

melsyd_economy %>%
  gg_subseries(Sales,period = 'week') +
  labs(title = "Ventas por día de la semana en Tienda 1",
       y = "Ventas", x='Día de la semana')
## Warning: Removed 134 row(s) containing missing values (geom_path).
## Warning: Removed 134 rows containing missing values (geom_hline).

Venta a lo largo del día del mes

melsyd_economy %>%
  gg_subseries(Sales,period = 'month') +
  labs(title = "Ventas por día del mes en Tienda 1",
       y = "Ventas", x='Día del mes')
## Warning: Removed 1 row(s) containing missing values (geom_path).

Boxplot Estacionalidad a lo largo del mes

ggplot(melsyd_economy %>%
  filter(Open==1) %>%
    mutate(DayOfMonth = as.factor(lubridate::day(Date))),  aes(x= DayOfMonth, y=Sales, color = DayOfMonth)) + 
  geom_boxplot()+
  labs(title = "Ventas por día del mes para la Tienda 1",
       y = "Ventas", x='Día del mes')

Días para que termine el mes

ggplot(melsyd_economy %>%
  filter(Open==1) %>%
    mutate(DayToEndOfMonth = as.factor(lubridate::days_in_month(Date) - lubridate::day(Date))),  aes(x= DayToEndOfMonth, y=Sales, color = DayToEndOfMonth)) + 
  geom_boxplot()

# Cuando faltan pocos días para que termine el mes la venta es alta.

Día posterior a día cerrado

ggplot(melsyd_economy %>%
    mutate(Abierto_Dia_Anterior = as.factor(lag(Open))) %>%
    filter(Open==1) ,  aes(x= Abierto_Dia_Anterior, y=Sales, color = Abierto_Dia_Anterior)) + 
  geom_boxplot()+
  labs(title = "Ventas diarias Tienda 1",
       y = "Ventas", x='Se encontraba abierto el día anterior?')

Día posterior a día cerrado, excluyendo lunes, dado que todos los domingos se encuentra cerrado

ggplot(melsyd_economy %>%
    mutate(Abierto_Dia_Anterior = as.factor(lag(Open))) %>%
    filter(Open==1, DayOfWeek!=1) ,  aes(x= Abierto_Dia_Anterior, y=Sales, color = Abierto_Dia_Anterior)) + 
  geom_boxplot()

# Boxplots Efecto de vacaciones de escuela

ggplot(melsyd_economy %>%
  filter(Open==1) %>%
    mutate(SchoolHoliday = as.factor(SchoolHoliday)),  aes(x= SchoolHoliday, y=Sales, color = SchoolHoliday)) + 
  geom_boxplot()+
  labs(title = "Ventas diarias Tienda 1",
       y = "Ventas", x='Ese día es vacaciones escolares?')

# La mediana parece ser más baja, no obstante hay mucho solapamiento

melsyd_economy %>%
  autoplot(Customers) +
  labs(title = "Clientes diarios Tienda 1",
       y = "Clientes", x='Fecha')
## Warning: Removed 1 row(s) containing missing values (geom_path).

Scatter Plot Clientes y Ventas

melsyd_economy %>%
  ggplot(aes(x = Customers, y = Sales)) +
  geom_point()
## Warning: Removed 161 rows containing missing values (geom_point).

Alta correlación entre clientes y ventas

cor(df_train_tienda_1$Customers, df_train_tienda_1$Sales)
## [1] 0.9843412
cor(df_train_tienda_1$Customers, df_train_tienda_1$Sales, method = 'spearman')
## [1] 0.9554348

Lag Plots

Scatter plot contra diferentes lags, a medida que nos alejamos en el tiempo, se encuentran menos relacionadas

melsyd_economy %>%
  gg_lag(Sales, geom = "point") +
  labs(x = "lag(Sales, k)")
## Warning: Removed 161 rows containing missing values (gg_lag).

Scatter Plot para diferentes lags

melsyd_economy %>%
  gg_lag(Sales, geom = "point", lags = c(1,2,3,7,14,30,365)   ) +
  labs(x = "lag(Sales, k)")
## Warning: Removed 161 rows containing missing values (gg_lag).

Autocorrelacion 40 lags

melsyd_economy %>%
  ACF(Sales, lag_max = 40, na.action = na.pass) %>%
  autoplot() + labs(title="Sales")

Autocorrelacion 400 lags

melsyd_economy %>%
  ACF(Sales, lag_max = 400, na.action = na.pass) %>%
  autoplot() + labs(title="Sales")

# Para lags pequeños tenemos una alta correlación, cada 7 días se ven picos, lo que indica que hay cierta estacionalidad # También vemos un pico grande aproximadamente al año # Cuando hay tendencia, suele haber una autocorrelación alta y positiva en los lags pequeños, valores cercanos en tiempo, cercanos también en valor # Cuando la data es estacional, las autocorrelaciones son altas para los lags estacionales (en múltiplos de dichos lags)

Descomponemos la serie en Tendencia, Estacionalidad Anual y Estacionalidad Semanal

melsyd_economy_2 <- df_train_filtered %>%
  filter(Store == 1) %>%
  # mutate(Sales = ifelse(Open == 1, Sales, NA),
  #        Customers = ifelse(Open == 1, Customers, NA),
  #        Promo = ifelse(Open == 1, Promo, NA),
  #        StateHoliday = ifelse(Open == 1, StateHoliday, NA),
  #        SchoolHoliday = ifelse(Open == 1, SchoolHoliday, NA),
  #        ) %>%
  as_tsibble(index = Date)

dcmp <- melsyd_economy_2 %>%
  model(stl = STL(Sales))
components(dcmp)
melsyd_economy_2 %>% autoplot(Sales)

components(dcmp) %>% autoplot()

components(dcmp) %>%
  as_tsibble() %>%
  autoplot(Sales, colour="gray") +
  geom_line(aes(y=trend), colour = "#D55E00") +
  labs(

    title = "Sales"
  )

Datos con estacionalidad ajustada

components(dcmp) %>%
  as_tsibble() %>%
  autoplot(Sales, colour = "gray") +
  geom_line(aes(y=season_adjust), colour = "#0072B2") +
  labs(title = "Ventas diarias Tienda 1 <estacionalidad ajustada>",
       y = "Ventas", x='Fecha')

Moving Average 7

# Considerando la serie completa
melsyd_economy_3 <- melsyd_economy_2 %>%
  mutate(
    `7-MA` = slider::slide_dbl(Sales, mean,
                .before = 3, .after = 3, .complete = TRUE)
  )

melsyd_economy_3 %>%
  autoplot(Sales) +
  geom_line(aes(y = `7-MA`), colour = "#D55E00") +
  labs(y = "Ventas",
       title = "Ventas diarias Tienda 1") +
  guides(colour = guide_legend(title = "series"))
## Warning: Removed 6 row(s) containing missing values (geom_path).

# Filtrando cuando se encuentra cerrado
melsyd_economy_3 <- melsyd_economy %>%
  mutate(
    `7-MA` = slider::slide_dbl(Sales, mean,
                .before = 3, .after = 3, .complete = TRUE)
  )


melsyd_economy_3 %>%
  autoplot(Sales) +
  geom_line(aes(y = `7-MA`), colour = "#D55E00") +
  labs(y = "Ventas",
       title = "Ventas diarias Tienda 1") +
  guides(colour = guide_legend(title = "series"))
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 942 row(s) containing missing values (geom_path).

Se puede calcular tendencia a partir de dos filtros moving average, en este caso, uno de 7 para la semana, y despues uno de 52 para el año (*) habría que chequearlo…

melsyd_economy_4 <- melsyd_economy_2 %>%
  mutate(
    `12-MA` = slider::slide_dbl(Sales, mean,
                .before = 3, .after = 3, .complete = TRUE),
    `2x12-MA` = slider::slide_dbl(`12-MA`, mean,
                .before = 26, .after = 26, .complete = TRUE)
  )
melsyd_economy_4 %>%
  autoplot(Sales, colour = "gray") +
  geom_line(aes(y = `2x12-MA`), colour = "#D55E00") +
  labs(y = "Ventas",
       title = "Ventas diarias Tienda 1 <tendencia como suma de dos filtros MA>")
## Warning: Removed 58 row(s) containing missing values (geom_path).

Descomposición clásica

melsyd_economy_2 %>%
  model(
    classical_decomposition(Sales, type = "additive")
    # classical_decomposition(Sales~season(365), type = "additive")
  ) %>%
  components() %>%
  autoplot() +
  labs(title = "Descomposición lineal aditiva para las ventas Tienda 1")
## Warning: Removed 3 row(s) containing missing values (geom_path).

Descomposiciones clásicas

# install.packages("seasonal")
library(seasonal)
## 
## Attaching package: 'seasonal'
## The following object is masked from 'package:tibble':
## 
##     view
# No funca
# x11_dcmp <- melsyd_economy_2[,c('Date','Sales')] %>%
#   model(x11 = X_13ARIMA_SEATS(Sales ~ x11(na.action = seasonal::na.x13))) %>%
#   components()
# autoplot(x11_dcmp) +
#   labs(title =
#     "Decomposition of total US retail employment using X-11.")
# 
# sum(is.na(melsyd_economy_2$Sales))

STL Decomposition, varias ventajas, lo único, no considera estacionalidad intradiaria o dentro del calendario

melsyd_economy_2 %>%
  model(
    STL(Sales ~ trend(window = 360) +
                   season(window = "periodic"),
    robust = TRUE)) %>%
  components() %>%
  autoplot()+
  labs(title = "Descomposición STL para las ventas Tienda 1")

# STL Features

melsyd_economy_2 %>%
  features(Sales, feat_stl)

BenchMark Models

Filter Data for train

df_train_tienda_1_train <- melsyd_economy_2 %>%
  filter(Date<'2015-01-01')
# install.packages('fable')
library(fable)

Mean

df_train_tienda_1_train %>% model(MEAN(Sales))

Naive last value

df_train_tienda_1_train %>% model(NAIVE(Sales))

Naive last cycle (seasonal) Year

df_train_tienda_1_train %>% model(SNAIVE(Sales ~ lag("year")))
## Warning: Non-integer lag orders for random walk models are not supported.
## Rounding to the nearest integer.

Naive last cycle (seasonal) Week

df_train_tienda_1_train %>% model(SNAIVE(Sales ~ lag("week")))

Drift

df_train_tienda_1_train %>% model(RW(Sales ~ drift()))

Different Models including 0

# Set training data from 1992 to 2006
# train <- aus_production %>%
#   filter_index("1992 Q1" ~ "2006 Q4")
# Fit the models
beer_fit <- df_train_tienda_1_train %>%
  model(
    Mean = MEAN(Sales),
    `Naïve` = NAIVE(Sales),
    `Seasonal naïve` = SNAIVE(Sales)
  )
# Generate forecasts for 14 quarters
beer_fc <- beer_fit %>% forecast(h = 42)
# Plot forecasts against actual values
beer_fc %>%
  autoplot(df_train_tienda_1_train, level = NULL) +
  autolayer( df_train_tienda_1_train %>%
    filter(Date>="2015-01-01") %>%
      select(Sales),
    colour = "black"
  ) +
  labs(
    title = "Forecasts para las ventas Tienda 1"
  ) +
  guides(colour = guide_legend(title = "Forecast"))
## Plot variable not specified, automatically selected `.vars = Sales`

Different Models Zeros as Null

# Set training data from 1992 to 2006
# train <- aus_production %>%
#   filter_index("1992 Q1" ~ "2006 Q4")
# Fit the models
beer_fit <- melsyd_economy %>%
  model(
    # Actual_Sales = Sales,
    Mean = MEAN(Sales),
    `Naïve` = NAIVE(Sales),
    `Seasonal naïve (week)` = SNAIVE(Sales ~ lag("week")),
    `Seasonal naïve (year)` = SNAIVE(Sales ~ lag("year")),
    
                              )
## Warning: Non-integer lag orders for random walk models are not supported.
## Rounding to the nearest integer.
# Generate forecasts for 14 quarters
beer_fc <- beer_fit %>% forecast(h = 42)
## Warning: Non-integer lag orders for random walk models are not supported.
## Rounding to the nearest integer.
# Plot forecasts against actual values
beer_fc %>%
  autoplot(melsyd_economy 
            # %>% filter(Date>="2014-10-20")
             , level = NULL) +
  autolayer( melsyd_economy %>%
    filter(Date>="2015-01-01") %>%
      select(Sales),
    colour = "black"
  ) +
  labs(
    title = "Forecasts para las ventas Tienda 1"
  ) +
  guides(colour = guide_legend(title = "Forecast"))
## Plot variable not specified, automatically selected `.vars = Sales`
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 row(s) containing missing values (geom_path).

Simple Models

# Set training data from 1992 to 2006
# train <- aus_production %>%
#   filter_index("1992 Q1" ~ "2006 Q4")
# Fit the models
beer_fit <- melsyd_economy %>%
  filter(Date<"2015-01-01") %>%
  model(
    # Actual_Sales = Sales,
    Mean = MEAN(Sales),
    `Naïve` = NAIVE(Sales),
    `Seasonal naïve (week)` = SNAIVE(Sales ~ lag("week")),
    # `Seasonal naïve (year)` = SNAIVE(Sales ~ lag("year")),
    
                              )
# Generate forecasts for 14 quarters
beer_fc <- beer_fit %>% forecast(h = 42)
# Plot forecasts against actual values
beer_fc %>%
  autoplot(melsyd_economy 
            %>% filter(Date>="2014-10-20")
             , level = NULL) +
  autolayer( melsyd_economy %>%
    filter(Date>="2015-01-01") %>%
      select(Sales),
    colour = "black"
  ) +
  labs(
    title = "Forecasts para las ventas Tienda 1"
  ) +
  guides(colour = guide_legend(title = "Forecast"))
## Plot variable not specified, automatically selected `.vars = Sales`
## Warning: Removed 2 row(s) containing missing values (geom_path).
## Warning: Removed 1 row(s) containing missing values (geom_path).

Ajustando el formato

df_train_tienda_1 <- df_train_tienda_1 %>%
  as_tsibble(index = Date) 

Simple Models

# Set training data from 1992 to 2006
# train <- aus_production %>%
#   filter_index("1992 Q1" ~ "2006 Q4")
# Fit the models
beer_fit <- df_train_tienda_1 %>%
  filter(Date<"2015-01-01") %>%
  model(
    # Actual_Sales = Sales,
    Mean = MEAN(Sales),
    `Naïve` = NAIVE(Sales),
    `Seasonal naïve (week)` = SNAIVE(Sales ~ lag("week")),
    # `Seasonal naïve (year)` = SNAIVE(Sales ~ lag("year")),
    
                              )
# Generate forecasts for 14 quarters
beer_fc <- beer_fit %>% forecast(h = 42)
# Plot forecasts against actual values
beer_fc %>%
  autoplot(df_train_tienda_1 
            %>% filter(Date>="2014-10-20")
             , level = NULL) +
  autolayer( df_train_tienda_1 %>%
    filter(Date>="2015-01-01") %>%
      select(Sales),
    colour = "black"
  ) +
  labs(
    title = "Forecasts para las ventas Tienda 1"
  ) +
  guides(colour = guide_legend(title = "Forecast"))
## Plot variable not specified, automatically selected `.vars = Sales`

Para ver los valoes del fit, residuos y residuos innovados (cambio de escala)

augment(beer_fit)

Residuos

aug <- df_train_tienda_1 %>%
  model(SNAIVE(Sales ~ lag("week"))) %>%
  augment()
autoplot(aug, .innov) +
  labs(title = "Residuos del método naïve")
## Warning: Removed 7 row(s) containing missing values (geom_path).

Histograma Residuos

aug %>%
  ggplot(aes(x = .innov)) +
  geom_histogram() +
  labs(title = "Histograma de los residuos")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7 rows containing non-finite values (stat_bin).

Autocorrelacion de los residuos

aug %>%
  ACF(.innov) %>%
  autoplot() +
  labs(title = "Residuos del método naïve")

# Se pueden ver algunas autocorrelaciones altas.

Tres gráficos en 1, Residuos a lo largo del tiempo, autocorrelación e histograma

df_train_tienda_1 %>%
  model(SNAIVE(Sales ~ lag("week"))) %>%
  gg_tsresiduals()
## Warning: Removed 7 row(s) containing missing values (geom_path).
## Warning: Removed 7 rows containing missing values (geom_point).
## Warning: Removed 7 rows containing non-finite values (stat_bin).

Intervalos de predicción

df_train_tienda_1 %>%
  model(SNAIVE(Sales ~ lag("week"))) %>%
  forecast(h = 42) %>%
  autoplot(df_train_tienda_1) +
  labs(title="Forecast de Ventas <intervalos de predicción SNAIVE>", y="$US" )

Intervalos de predicción

melsyd_economy %>%
  model(SNAIVE(Sales ~ lag("week"))) %>%
  forecast(h = 42) %>%
  autoplot(melsyd_economy) +
  labs(title="Forecast de ventas <Intervalos de predicción SNAIVE>", y="Ventas" )
## Warning: Removed 1 row(s) containing missing values (geom_path).

Intervalos de predicción

melsyd_economy %>%
  model(NAIVE(Sales)) %>%
  forecast(h = 42) %>%
  autoplot(melsyd_economy) +
  labs(title="Forecast de ventas <Intervalos de predicción SNAIVE>", y="Ventas" )
## Warning: Removed 1 row(s) containing missing values (geom_path).

Forecast for the last 42 days (6 weeks)

recent_production <- melsyd_economy %>%
  slice(n()-42:0)
beer_train <- melsyd_economy %>%
  slice(1:(n()-43))

beer_fit <- beer_train %>%
  model(
    Mean = MEAN(Sales),
    `Naïve` = NAIVE(Sales),
    `Seasonal naïve` = SNAIVE(Sales),
    Drift = RW(Sales ~ drift())
  )

beer_fc <- beer_fit %>%
  forecast(h = 42)

beer_fc %>%
  autoplot(
    melsyd_economy %>% slice(n()-134:0),
    level = NULL
  ) +
  labs(title="Forecast de ventas", y="Ventas" ) +
  guides(colour = guide_legend(title = "Forecast"))

Accuracy metrics

accuracy(beer_fc, recent_production)

Forecast for the last 42 days (6 weeks)

recent_production <- melsyd_economy_2 %>%
  slice(n()-42:0)
beer_train <- melsyd_economy_2 %>%
  slice(1:(n()-43))

beer_fit <- beer_train %>%
  model(
    Mean = MEAN(Sales),
    `Naïve` = NAIVE(Sales),
    `Seasonal naïve` = SNAIVE(Sales),
    Drift = RW(Sales ~ drift())
  )

beer_fc <- beer_fit %>%
  forecast(h = 42)

beer_fc %>%
  autoplot(
    melsyd_economy_2 %>% slice(n()-134:0),
    level = NULL
  ) +
  labs(title="Forecast de ventas", y="Ventas" ) +
  guides(colour = guide_legend(title = "Forecast"))

Accuracy metrics evaluación del modelo

accuracy(beer_fc, recent_production)

Walk forward validation (CV cross validation)

Create Training Test Partitions

# Time series cross-validation accuracy
google_2015_tr <- melsyd_economy_2 %>%
  stretch_tsibble(.init = 731, .step = 42) %>% # Toma al menos los primeros dos años y va tomando de a 6 semanas para testear
  relocate(Date, .id)
google_2015_tr

Cross validation

rbind(
# TSCV accuracy
google_2015_tr %>%
  model(RW(Sales ~ drift())) %>%
  forecast(h = 1) %>%
  accuracy(melsyd_economy_2)
,
# Training set accuracy
melsyd_economy_2 %>%
  model(RW(Sales ~ drift())) %>%
  accuracy()
)

Cross validation

# TSCV accuracy
rbind(
google_2015_tr %>%
  model(SNAIVE(Sales)) %>%
  forecast(h = 42) %>%
  accuracy(melsyd_economy_2)
,
# Training set accuracy
melsyd_economy_2 %>%
  model(SNAIVE(Sales)) %>%
  accuracy()
)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 41 observations are missing between 2015-08-01 and 2015-09-10

Regresion Lineal

fit_beer <- recent_production %>%
  model(TSLM(Sales ~ trend() + season()))
report(fit_beer)
## Series: Sales 
## Model: TSLM 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1046.43  -515.83    11.41   450.16  1420.09 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4029.557    346.809  11.619 1.45e-13 ***
## trend()          13.787      8.445   1.633    0.112    
## season()week2   -73.866    379.883  -0.194    0.847    
## season()week3   -96.233    395.873  -0.243    0.809    
## season()week4 -4312.186    395.061 -10.915 8.16e-13 ***
## season()week5   499.527    394.429   1.266    0.214    
## season()week6   119.907    393.976   0.304    0.763    
## season()week7  -123.713    393.705  -0.314    0.755    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 681.8 on 35 degrees of freedom
## Multiple R-squared: 0.8627,  Adjusted R-squared: 0.8353
## F-statistic: 31.43 on 7 and 35 DF, p-value: 2.783e-13

Regresion Lineal

melsyd_economy_rl = melsyd_economy_2 %>%
  filter(Date<"2015-06-19")
melsyd_economy_rl$DayOfWeek = as.factor(melsyd_economy_rl$DayOfWeek)

fit_beer <- melsyd_economy_rl %>%
  model(TSLM(Sales ~ Open * (DayOfWeek + Date) - Open - DayOfWeek - Date))
  # model(TSLM(Sales ~ Date))
report(fit_beer)
## Series: Sales 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2366.6  -611.4     0.0   403.3  4298.4 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -3.155e-12  7.237e+01   0.000  1.00000    
## Open:DayOfWeek1  1.118e+04  2.066e+03   5.412 8.03e-08 ***
## Open:DayOfWeek2  1.068e+04  2.066e+03   5.170 2.89e-07 ***
## Open:DayOfWeek3  1.056e+04  2.066e+03   5.109 3.96e-07 ***
## Open:DayOfWeek4  1.044e+04  2.065e+03   5.057 5.16e-07 ***
## Open:DayOfWeek5  1.073e+04  2.064e+03   5.199 2.49e-07 ***
## Open:DayOfWeek6  1.096e+04  2.065e+03   5.306 1.41e-07 ***
## Open:DayOfWeek7         NA         NA      NA       NA    
## Open:Date       -3.703e-01  1.277e-01  -2.900  0.00382 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 900.9 on 891 degrees of freedom
## Multiple R-squared: 0.8047,  Adjusted R-squared: 0.8031
## F-statistic: 524.3 on 7 and 891 DF, p-value: < 2.22e-16
recent_production <- melsyd_economy_2 %>%
  slice(n()-42:0)
recent_production$DayOfWeek = as.factor(recent_production$DayOfWeek)

fc_beer <- forecast(fit_beer, new_data = recent_production)
## Warning: prediction from a rank-deficient fit may be misleading
fc_beer %>%
  autoplot(recent_production) +
  labs(
    title = "Forecast de ventas usando regresión"
  )

Métricas del modelo

accuracy(fc_beer, recent_production)

Exponential Smoothing

# # install.packages('forecast')
# library(Rcpp)
# library(forecast)
# # Estimate parameters
fit <- melsyd_economy_2 %>%
  filter(Date<"2015-06-19") %>%
  model(fable::ETS(Sales ~ error("A") + trend("N") + season("N")))
fc <- fit %>%
  forecast(h = 42)

Plot

fc %>%
  autoplot(melsyd_economy_2 %>%
             slice(n()-126:0)) +
  labs(title="Exponential Smoothing") +
  guides(colour = "none")

Métricas del modelo

accuracy(fc, recent_production )

Modelos con tendencia y estacionalidad aditiva y multiplicativa

aus_holidays <- melsyd_economy_2 %>%
  filter(Date<"2015-06-19") %>%
  summarise(Sales = sum(Sales)/1e3)

fit <- aus_holidays %>%
  model(
    additive = ETS(Sales ~ error("A") + trend("A") +
                                                season("A")),
    multiplicative = ETS(Sales ~ error("M") + trend("A") +
                                                season("M"))
  )
fc <- fit %>% forecast(h = 42)
fc %>%
  autoplot(aus_holidays %>%
           slice(n()-126:0), level = NULL) +
  labs(title="Sales <Estacionalidad Holts Winters> - Additive and Multiplicative") +
  guides(colour = guide_legend(title = "Forecast"))

Métricas del modelo

accuracy(fc, recent_production %>%
           mutate(Sales = Sales)  )

Efecto Multiplicativo

sth_cross_ped <- melsyd_economy_2 %>%
  summarise(Sales = sum(Sales)/1e3)

sth_cross_ped %>%
  filter(Date<"2015-06-19") %>%
  model(
    hw = ETS(Sales ~ error("M") + trend("Ad") + season("M"))
  ) %>%
  forecast(h = 42) %>%
  autoplot(sth_cross_ped %>% slice(n()-126:0)) +
  labs(title = "Estacionalidad Holts - Winters Multiplicativo")

Descomposición del modelo Holts-Winters en Nivel, Tendencia, Estacionalidad

dcmp <- melsyd_economy_2 %>%
  summarise(Sales = sum(Sales)/1e3) %>%
    model(
    hw = ETS(Sales ~ error("M") + trend("Ad") + season("M"))
  ) 
components(dcmp)
melsyd_economy_2 %>% autoplot(Sales)

components(dcmp) %>% autoplot()
## Warning: Removed 7 row(s) containing missing values (geom_path).

Descomposición del modelo Tendancia y Estacionalidad aditivas en Nivel, Tendencia, Estacionalidad

dcmp <- melsyd_economy_2 %>%
  summarise(Sales = sum(Sales)/1e3) %>%
    model(
    hw = ETS(Sales ~ error("A") + trend("Ad") + season("A"))
  ) 
components(dcmp)
melsyd_economy_2 %>% autoplot(Sales)

components(dcmp) %>% autoplot()
## Warning: Removed 7 row(s) containing missing values (geom_path).

ARIMA

melsyd_economy_2 %>%
  mutate(diff_sales = difference(Sales)) %>%
  features(diff_sales, ljung_box, lag = 10)
melsyd_economy_2 %>%
  summarise(Sales = sum(Sales)/1e6) %>%
  transmute(
    `Sales ($million)` = Sales,
    `Log sales` = log(Sales),
    `Annual change in log sales` = difference(log(Sales), 365),
    `Doubly differenced log sales` =
                     difference(difference(log(Sales), 365), 1)
  ) %>%
  pivot_longer(-Date, names_to="Type", values_to="Sales") %>%
  mutate(
    Type = factor(Type, levels = c(
      "Sales ($million)",
      "Log sales",
      "Annual change in log sales",
      "Doubly differenced log sales"))
  ) %>%
  ggplot(aes(x = Date, y = Sales)) +
  geom_line() +
  facet_grid(vars(Type), scales = "free_y") +
  labs(title = "Corticosteroid drug sales", y = NULL)

Test de estacionariedad

melsyd_economy_2 %>%
  features(Sales, unitroot_kpss)
melsyd_economy_2 %>%
mutate(diff_close = difference(Sales, 365)) %>%
features(diff_close, unitroot_kpss)
melsyd_economy_2 %>%
mutate(diff_close = difference(Sales, 7)) %>%
features(diff_close, unitroot_kpss)
melsyd_economy_2 %>%
  features(Sales, unitroot_ndiffs)

ARIMA con estacionalidad

Estacionalidad Anual

melsyd_economy_2 %>%
  gg_tsdisplay(difference(Sales, 365),
               plot_type='partial', lag=36) +
  labs(title="Seasonally differenced", y="")
## Warning: Removed 365 row(s) containing missing values (geom_path).
## Warning: Removed 365 rows containing missing values (geom_point).

Estacionalidad semanal

melsyd_economy_2 %>%
  gg_tsdisplay(difference(Sales, 7),
               plot_type='partial', lag=36) +
  labs(title="Seasonally differenced", y="")
## Warning: Removed 7 row(s) containing missing values (geom_path).
## Warning: Removed 7 rows containing missing values (geom_point).

Restando diferencia de orden 1 adicionalmente

melsyd_economy_2 %>%
  gg_tsdisplay(difference(Sales, 28) %>% difference(),
               plot_type='partial', lag=36) +
  labs(title = "Double differenced", y="")
## Warning: Removed 29 row(s) containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_point).

Removiendo Nulos

Estacionalidad Anual

melsyd_economy %>%
  gg_tsdisplay(difference(Sales, 365),
               plot_type='partial', lag=36) +
  labs(title="Seasonally differenced", y="")
## Warning: Removed 366 row(s) containing missing values (geom_path).
## Warning: Removed 552 rows containing missing values (geom_point).

Estacionalidad semanal

melsyd_economy %>%
  gg_tsdisplay(difference(Sales, 7),
               plot_type='partial', lag=36) +
  labs(title="Seasonally differenced", y="")
## Warning: Removed 8 row(s) containing missing values (geom_path).
## Warning: Removed 191 rows containing missing values (geom_point).

Restando diferencia de orden 1 adicionalmente

melsyd_economy_2 %>%
  gg_tsdisplay(difference(Sales, 28) %>% difference(),
               plot_type='partial', lag=36) +
  labs(title = "Double differenced", y="")
## Warning: Removed 29 row(s) containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_point).

# install.packages('fable.prophet')
library(fable.prophet)
## Loading required package: Rcpp
train <- melsyd_economy_2 %>%
  filter(Date<"2015-06-19")
fit <- train %>%
  model(
    arima = ARIMA(Sales),
    ets = ETS(Sales),
    prophet = prophet(Sales ~ season(type = "multiplicative"))
  )
fc <- fit %>% forecast(h = 42)
fc %>% autoplot(melsyd_economy_2 %>%
                  slice(n()-134:0)
                  )

Neural Nets (Redes Neuronales)

melsyd_economy_2 %>%
  filter(Date<"2015-06-19") %>%
  model(NNETAR(Sales)) %>%
  forecast(h = 42, times = 100) %>% # El parámetro times permite correr distintas simulaciones, cambian los intervalos de confianza
  autoplot(melsyd_economy_2 %>%
                  slice(n()-134:0)) +
  labs(x = "Year", y = "Counts",
       title = "Yearly sunspots")

Seleccion del mejor modelo

Walk forward validation (CV cross validation)

Create Training - Val Partitions para WFV

# Time series cross-validation accuracy
df_cv <- melsyd_economy_2 %>%
  slice(0:(n()-85)) %>%
  stretch_tsibble(.init = 731, .step = 42) %>% # Toma al menos los primeros dos años y va tomando de a 6 semanas para testear
  relocate(Date, .id)


print(paste("Cantidad de folds: ", max(df_cv$.id)))
## [1] "Cantidad de folds:  4"
df_cv

Cross validation

rbind(
# TSCV accuracy
df_cv %>%
    model(
    Mean = MEAN(Sales),
    `Naïve` = NAIVE(Sales),
    `Seasonal naïve` = SNAIVE(Sales),
    Drift = RW(Sales ~ drift()),
    ETS = ETS(Sales),
    # Prophet = prophet(Sales ~ season(type = "multiplicative")),
    Prophet = prophet(Sales),
    # Redes_Neuronales = model(NNETAR(Sales))
  )
 %>%
  forecast(h =42) %>%
  accuracy(melsyd_economy_2) # Filtro sólo los días que estuvo abierto 
# ,
# # Training set accuracy
# melsyd_economy_2 %>%
#   model(RW(Sales ~ drift())) %>%
#   accuracy()
)

Moving Average 7 días antes

Create Training - Val Partitions para WFV

# Time series cross-validation accuracy
df_cv <- melsyd_economy_2 %>%
    mutate(
    Sales = slider::slide_dbl(Sales, mean,
                .before = 6, .after = 0, .complete = FALSE)) %>%
  slice(0:(n()-85)) %>%
  stretch_tsibble(.init = 731, .step = 42) %>% # Toma al menos los primeros dos años y va tomando de a 6 semanas para testear
  relocate(Date, .id)


print(paste("Cantidad de folds: ", max(df_cv$.id)))
## [1] "Cantidad de folds:  4"
df_cv

Cross validation

rbind(
# TSCV accuracy
df_cv %>%
    model(
    Mean = MEAN(Sales),
    `Naïve` = NAIVE(Sales),
    `Seasonal naïve` = SNAIVE(Sales),
    Drift = RW(Sales ~ drift()),
    ETS = ETS(Sales),
    # Prophet = prophet(Sales ~ season(type = "multiplicative")),
    Prophet = prophet(Sales),
    # Redes_Neuronales = model(NNETAR(Sales))
  )
 %>%
  forecast(h =42) %>%
  accuracy(melsyd_economy_2 ) # Filtro sólo los días que estuvo abierto 
# ,
# # Training set accuracy
# melsyd_economy_2 %>%
#   model(RW(Sales ~ drift())) %>%
#   accuracy()
)

Moving Average 28 días antes

Create Training - Val Partitions para WFV

# Time series cross-validation accuracy
df_cv <- melsyd_economy_2 %>%
    mutate(
    Sales = slider::slide_dbl(Sales, mean,
                .before = 27, .after = 0, .complete = FALSE)) %>%
  slice(0:(n()-85)) %>%
  stretch_tsibble(.init = 731, .step = 11) %>% # Toma al menos los primeros dos años y va tomando de a 6 semanas para testear
  relocate(Date, .id)


print(paste("Cantidad de folds: ", max(df_cv$.id)))
## [1] "Cantidad de folds:  12"
df_cv

Cross validation

rbind(
# TSCV accuracy
df_cv %>%
    model(
    Mean = MEAN(Sales),
    `Naïve` = NAIVE(Sales),
    `Seasonal naïve` = SNAIVE(Sales),
    Drift = RW(Sales ~ drift()),
    ETS = ETS(Sales),
    # Prophet = prophet(Sales ~ season(type = "multiplicative")),
    Prophet = prophet(Sales),
    # Redes_Neuronales = model(NNETAR(Sales))
  )
 %>%
  forecast(h = 42) %>%
  accuracy(melsyd_economy_2 ) # Filtro sólo los días que estuvo abierto 
# ,
# # Training set accuracy
# melsyd_economy_2 %>%
#   model(RW(Sales ~ drift())) %>%
#   accuracy()
)

Resultados en TEST (los últimos 42 días)

melsyd_economy_2 %>%
  slice(0:(n()-42)) %>%
    model(
      Prophet = prophet(Sales)
    )%>%
  forecast(h = 42) %>%
  accuracy(melsyd_economy_2)

Gráfico en Test

melsyd_economy_2 %>%
  slice(0:(n()-42)) %>%
  # filter(Open==1) %>%
    model(
      Prophet = prophet(Sales)
    )%>%
  forecast(h = 42) %>%
  autoplot(melsyd_economy_2 %>%
             # filter(Open==1) %>%
                  slice(n()-134:0)) +
  labs(title = "Ventas últimas 18 semanas y predicciones con Prophet")

Descomposición Prophet

fit <- melsyd_economy_2 %>%
  slice(0:(n()-42)) %>%
  model(
    prophet(Sales) ) 
fit %>%
  components() %>%
  autoplot()

Análisis de residuos

fit %>% gg_tsresiduals()